We are using the SHIVA trial to run a retrospective analysis to validate the PTD package. During the trial, the hormon receptors ER, AR and PR were colelcted using Immunohistochemistry (IHC). In the retrospective analisi we used the expression (as z-score) values collected from the RNA-seq data TCGA dataset. In this section we want to explore the relationship between RNA-seq and Immunohistochemistry and, possibly, identify a “entrance” threshold that we can use in the simulation.

!!!EXPLAIN BETTER HERE!!!

Comparative analysis

To run the comparison analysis we will need two datasets:

Both dataset need to have in common the same patients so that we can reconstruct the index.

The analysis will be run on two hormon receptos:

ER analysis

IHC dataset

As Input dataset we are choosing to use:

Clinical data downloaded from cBioportal for the dataset: Breast Invasive Carcinoma (TCGA, Cell 2015) - LINK

The dataset has been downloade and stored as brca_tcga_pub2015_clinical_data.tsv.

library(dplyr)
library(ggplot2)
library(plotly)
library(readr)
library(knitr)
ihc <-readr::read_tsv("../external_resources/brca_tcga_pub2015_clinical_data.tsv")
ihcFilter <- ihc %>% 
  dplyr::select(`Patient ID`
                #, `Sample ID`
                , `ER Status By IHC`
                , `ER Status IHC Percent Positive`
                #, `ER positivity scale used`
                #, `ER positivity scale other`
                #, `IHC Score`
                #, `IHC-HER2`
                #, `PR positivity scale used`
                #, `PR status by ihc`
                #, `PR status ihc percent positive`
                ) %>%
  dplyr::filter(!is.na(`ER Status By IHC`)) %>% # Remove the <NA>
  dplyr::filter(!is.na(`ER Status IHC Percent Positive`)) %>% # Remove the <NA>
  dplyr::rename(case_id=`Patient ID`, er_status=`ER Status By IHC`, ihc_value=`ER Status IHC Percent Positive`)
 
# preview
kable(head(ihcFilter), caption="top 6 rows")
case_id er_status ihc_value
TCGA-A2-A3XV Positive <10%
TCGA-A2-A3Y0 Positive 90-99%
TCGA-LL-A50Y Positive 90-99%
TCGA-LL-A5YP Positive <10%
TCGA-LL-A5YL Positive 90-99%
TCGA-LL-A5YM Positive 90-99%

RNA-seq dataset

The RNA-seq dataset was extracted using PTD function.!

case_id expressionValue
TCGA-A1-A0SB -0.9404
TCGA-A1-A0SD -0.1790
TCGA-A1-A0SE -0.6355
TCGA-A1-A0SF -0.3363
TCGA-A1-A0SH -0.9036
TCGA-A1-A0SI -0.6584

Inner Join datasets

df <- dplyr::inner_join(rnaseq, ihcFilter, by="case_id")
DT::datatable(df
          # ADD BUTTONS TO THE TABLE
          , extensions = 'Buttons'
          , options = list(
               dom = 'lBfrtip'
              , buttons = c('copy', 'csv', 'excel')
              )
          , caption = "Comparison between Missing and Submitted regions (bp) in the panel"
          )

Comparison Analysis

Explore RNA-seq Z-score

# explore z-score value
p1 <- ggplot(df, aes(x=expressionValue)) +
        geom_density(kernel="gaussian") + 
        geom_vline(aes(xintercept=0.3, color="red")) + 
        labs(x="Expression z-scores", title="Rna-seq expression density plot") + 
        theme(legend.position = "none", plot.title=element_text(size=10))
p1

Explore ICH values

# barplot 
p2 <- ggplot(data=df, aes(x=ihc_value)) + 
        geom_bar(stat = "count", position = "stack") + 
        labs(title="Barplot with COUNT of patients in each ER ICH expression value (from 0 to 100%)")+
        theme(legend.position = "none", plot.title=element_text(size=10))
p2

Compare

p3 <- ggplot(data=df, aes(x=ihc_value, y=expressionValue, group=1)) +
    geom_point(colour="red", size=1, shape=21, fill="white") +
    labs(title="Comparison between RNA-seq and IHC values for ER in Breast cancer") +
    xlab("IHC value") +
    ylab("RNA-seq z-score") +
    geom_smooth(method="lm") +
    geom_hline(yintercept =0.3) +
    theme(legend.position = "none", plot.title=element_text(size=10))
ggplotly(p3,width = 650, height = 400, margin(t=1000))

Fit to a linear model

# Convert chategorical values to continue numerical value 
# <10% = 1
# 10-19% = 2
# etc..
df$ihc_value2 <- as.numeric(factor(df$ihc_value))
# Fit the data into a linea regressino model ache chek the coefficients
summary(lm(df$expressionValue ~ ihc_value2, df))

Call:
lm(formula = df$expressionValue ~ ihc_value2, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.1511 -0.5160 -0.1985  0.3118  3.6848 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.91121    0.10598  -8.598 4.32e-16 ***
ihc_value2   0.10991    0.01292   8.510 7.99e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.7788 on 305 degrees of freedom
Multiple R-squared:  0.1919,    Adjusted R-squared:  0.1892 
F-statistic: 72.42 on 1 and 305 DF,  p-value: 7.986e-16

There is a significant linear relationship between the predictor and the outcome. Although the \(R^2\) value is very low (\(R^2\) indicates the percentage of total variation explained by the linear relationship with the predictors).

  • Pearson correlation: 0.4380399

Put all the plots together

Error: `device` must be NULL, a string or a function.

PR analysis

IHC dataset

ihcPRFilter <- ihc %>% 
  dplyr::select(`Patient ID`
                #, `Sample ID`
                #, `ER Status By IHC`
                #, `ER Status IHC Percent Positive`
                #, `ER positivity scale used`
                #, `ER positivity scale other`
                #, `IHC Score`
                #, `IHC-HER2`
                #, `PR positivity scale used`
                , `PR status by ihc`
                , `PR status ihc percent positive`
                ) %>%
  dplyr::filter(!is.na(`PR status by ihc`)) %>% # Remove the <NA>
  dplyr::filter(!is.na(`PR status ihc percent positive`)) %>% # Remove the <NA>
  dplyr::rename(case_id=`Patient ID`, pr_status=`PR status by ihc`, ihc_value=`PR status ihc percent positive`)

RNA-seq dataset

The RNA-seq dataset was extracted using PTD function.!

panel_design <- data.frame(drug=""
    , gene_symbol="PGR"
    , alteration="expression"
    , exact_alteration="up"
    ,   mutation_specification=""
    ,   group="")


panel <- newCancerPanel(panel_design)
panel <- getAlterations(panel, tumor_type = "brca_tcga_pub2015")
panel <- subsetAlterations(panel)

# Load data from SHIVA retrospective analaysis
#panel <- readRDS("../Temp/shiva_panel.rds")
# Fetch data
rnaseq_PR <- panel@dataFull$expression$data %>%
  filter(tumor_type == "brca") %>%
  filter(gene_symbol == "PGR") %>%
  select(case_id, expressionValue)

Inner join datasets

# join
dfPR <- dplyr::inner_join(rnaseq_PR, ihcPRFilter, by="case_id")

Comparison analysis

Explore RNA-seq Z-score

p1 <- ggplot(dfPR, aes(x=expressionValue)) +
        geom_density(kernel="gaussian") + 
        geom_vline(aes(xintercept=0.3, color="red")) + 
        labs(x="Expression z-scores", title="Rna-seq expression density plot") + 
        theme(legend.position = "none", plot.title=element_text(size=10))
p1

Explore ICH values

p2 <- ggplot(data=dfPR, aes(x=ihc_value)) + 
        geom_bar(stat = "count", position = "stack") + 
        labs(title="Barplot with COUNT of patients in each PR ICH expression value (from 0 to 100%)")+
        theme(legend.position = "none", plot.title=element_text(size=10))
p2

Compare

p3 <- ggplot(data=dfPR, aes(x=ihc_value, y=expressionValue, group=1)) +
    geom_point(colour="red", size=1, shape=21, fill="white") +
    labs(title="Comparison between RNA-seq and IHC values for PR in Breast cancer") +
    xlab("IHC value") +
    ylab("RNA-seq z-score") +
    geom_smooth(method="lm") +
    geom_hline(yintercept =0.3) +
    theme(legend.position = "none", plot.title=element_text(size=10))

ggplotly(p3,width = 650, height = 400, margin(t=1000))

Fit to a linear model

# Convert chategorical values to continue numerical value 
# <10% = 1
# 10-19% = 2
# etc..
dfPR$ihc_value2 <- as.numeric(factor(dfPR$ihc_value))
# Fit the data into a linea regressino model ache chek the coefficients
summary(lm(expressionValue ~ ihc_value2, dfPR))

Put all the plots together

---
title: "Comparison between RNA-seq and Immunohistochemistry data"
output:
  html_document:
    code_folding: hide
    fig_height: 8
    fig_width: 10
    highlight: haddock
    theme: readable
    toc: yes
    toc_depth: 3
    toc_float: yes
  html_notebook: default
---

We are using the SHIVA trial to run a retrospective analysis to validate the PTD package. During the trial, the hormon receptors ER, AR and PR were colelcted using Immunohistochemistry (IHC). In the retrospective analisi we used the expression (as z-score) values collected from the RNA-seq data TCGA dataset. In this section we want to explore the relationship between RNA-seq and Immunohistochemistry and, possibly, identify a "entrance" threshold that we can use in the simulation.

!!!EXPLAIN BETTER HERE!!!

Comparative analysis {.tabset}
================================================================================


To run the comparison analysis we will need two datasets:

* Dataset with IHC values, 
* Dataset with RNA-seq expresison values

Both dataset need to have in common the same patients so that we can reconstruct the index.

The analysis will be run on two hormon receptos: 

* ER
* PR


ER analysis 
--------------------------------------------------------------------------------

### IHC dataset

As Input dataset we are choosing to use: 

Clinical data downloaded from cBioportal for the dataset: **Breast Invasive Carcinoma (TCGA, Cell 2015)** - [LINK](https://git.ieo.eu/acc-bioinfo/meta/activity)

The dataset has been downloade and stored as  ```brca_tcga_pub2015_clinical_data.tsv```. 

```{r, message=FALSE, warning=FALSE}
library(dplyr)
library(ggplot2)
library(plotly)
library(readr)
library(knitr)
library(PrecisionTrialDesigner)

ihc <-readr::read_tsv("../external_resources/brca_tcga_pub2015_clinical_data.tsv")
ihcFilter <- ihc %>% 
  dplyr::select(`Patient ID`
                #, `Sample ID`
                , `ER Status By IHC`
                , `ER Status IHC Percent Positive`
                #, `ER positivity scale used`
                #, `ER positivity scale other`
                #, `IHC Score`
                #, `IHC-HER2`
                #, `PR positivity scale used`
                #, `PR status by ihc`
                #, `PR status ihc percent positive`
                ) %>%
  dplyr::filter(!is.na(`ER Status By IHC`)) %>% # Remove the <NA>
  dplyr::filter(!is.na(`ER Status IHC Percent Positive`)) %>% # Remove the <NA>
  dplyr::rename(case_id=`Patient ID`, er_status=`ER Status By IHC`, ihc_value=`ER Status IHC Percent Positive`)
 
# preview
kable(head(ihcFilter), caption="top 6 rows")
```


### RNA-seq dataset

The RNA-seq dataset was extracted using PTD function.!

```{r}
panel_design <- data.frame(drug=""
    , gene_symbol="ESR1"
    , alteration="expression"
    , exact_alteration="up"
    ,	mutation_specification=""
    ,	group="")


panel <- newCancerPanel(panel_design)
panel <- getAlterations(panel, tumor_type = "brca_tcga_pub2015")
panel <- subsetAlterations(panel)

# Load data from SHIVA retrospective analaysis
#panel <- readRDS("../Temp/shiva_panel.rds")

# Fetch data
rnaseq <- panel@dataFull$expression$data %>%
  filter(tumor_type == "brca") %>%
  filter(gene_symbol == "ESR1") %>%
  select(case_id, expressionValue)

# Preview
kable(head(rnaseq), caption = "top 6 rows")
```


### Inner Join datasets

```{r}
df <- dplyr::inner_join(rnaseq, ihcFilter, by="case_id")
DT::datatable(df
          # ADD BUTTONS TO THE TABLE
          , extensions = 'Buttons'
          , options = list(
               dom = 'lBfrtip'
              , buttons = c('copy', 'csv', 'excel')
              )
          , caption = "Comparison between Missing and Submitted regions (bp) in the panel"
          )
```


### Comparison Analysis



#### Explore RNA-seq Z-score 

```{r}
# explore z-score value
p1 <- ggplot(df, aes(x=expressionValue)) +
        geom_density(kernel="gaussian") + 
        geom_vline(aes(xintercept=0.3, color="red")) + 
        labs(x="Expression z-scores", title="Rna-seq expression density plot") + 
        theme(legend.position = "none", plot.title=element_text(size=10))
p1


ggsave(filename="../Figures/fig_extra1.svg", plot=p1, device = "svg")
```

#### Explore ICH values

```{r}
# barplot 
p2 <- ggplot(data=df, aes(x=ihc_value)) + 
        geom_bar(stat = "count", position = "stack") + 
        labs(title="Barplot with COUNT of patients in each ER ICH expression value (from 0 to 100%)")+
        theme(legend.position = "none", plot.title=element_text(size=10))
p2

ggsave(filename="../Figures/fig_extra2.svg", plot=p2, device = "svg")
```


#### Compare

```{r, fig.width=7}
p3 <- ggplot(data=df, aes(x=ihc_value, y=expressionValue, group=1)) +
    geom_point(colour="red", size=1, shape=21, fill="white") +
    labs(title="Comparison between RNA-seq and IHC values for ER in Breast cancer") +
    xlab("IHC value") +
    ylab("RNA-seq z-score") +
    geom_smooth(method="lm") +
    geom_hline(yintercept =0.3) +
    theme(legend.position = "none", plot.title=element_text(size=10))

ggplotly(p3,width = 650, height = 400, margin(t=1000))

ggsave(filename="../Figures/fig_extra3.svg", plot=p3, device = "svg")
```


#### Fit to a linear model


```{r}
# Convert chategorical values to continue numerical value 
# <10% = 1
# 10-19% = 2
# etc..
df$ihc_value2 <- as.numeric(factor(df$ihc_value))
# Fit the data into a linea regressino model ache chek the coefficients
summary(lm(df$expressionValue ~ ihc_value2, df))
```

There is a significant linear relationship between the predictor and the outcome. Although the $R^2$ value is very low ($R^2$ indicates the percentage of total variation explained by the linear relationship with the predictors). 

*  Pearson correlation: **`r cor(as.numeric(df$ihc_value2), df$expressionValue)`**

### Put all the plots together


```{r, fig.height=8, fig.width=12, echo=FALSE}
library(grid)
library(gridExtra)
grid.newpage()
pushViewport(viewport(layout = grid.layout(2, 2)))

print(p1 + coord_flip(), vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p3, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
print(p2, vp = viewport(layout.pos.row = 2, layout.pos.col = 2))
```


PR analysis
--------------------------------------------------------------------------------

### IHC dataset

```{r}
ihcPRFilter <- ihc %>% 
  dplyr::select(`Patient ID`
                #, `Sample ID`
                #, `ER Status By IHC`
                #, `ER Status IHC Percent Positive`
                #, `ER positivity scale used`
                #, `ER positivity scale other`
                #, `IHC Score`
                #, `IHC-HER2`
                #, `PR positivity scale used`
                , `PR status by ihc`
                , `PR status ihc percent positive`
                ) %>%
  dplyr::filter(!is.na(`PR status by ihc`)) %>% # Remove the <NA>
  dplyr::filter(!is.na(`PR status ihc percent positive`)) %>% # Remove the <NA>
  dplyr::rename(case_id=`Patient ID`, pr_status=`PR status by ihc`, ihc_value=`PR status ihc percent positive`)
```

### RNA-seq dataset

The RNA-seq dataset was extracted using PTD function.!

```{r}
panel_design <- data.frame(drug=""
    , gene_symbol="PGR"
    , alteration="expression"
    , exact_alteration="up"
    ,	mutation_specification=""
    ,	group="")


panel <- newCancerPanel(panel_design)
panel <- getAlterations(panel, tumor_type = "brca_tcga_pub2015")
panel <- subsetAlterations(panel)

# Load data from SHIVA retrospective analaysis
#panel <- readRDS("../Temp/shiva_panel.rds")
# Fetch data
rnaseq_PR <- panel@dataFull$expression$data %>%
  filter(tumor_type == "brca") %>%
  filter(gene_symbol == "PGR") %>%
  select(case_id, expressionValue)
```

### Inner join datasets

```{r}
# join
dfPR <- dplyr::inner_join(rnaseq_PR, ihcPRFilter, by="case_id")
```


### Comparison analysis

#### Explore RNA-seq Z-score 

```{r}
p1 <- ggplot(dfPR, aes(x=expressionValue)) +
        geom_density(kernel="gaussian") + 
        geom_vline(aes(xintercept=0.3, color="red")) + 
        labs(x="Expression z-scores", title="Rna-seq expression density plot") + 
        theme(legend.position = "none", plot.title=element_text(size=10))
p1
```


#### Explore ICH values

```{r}
p2 <- ggplot(data=dfPR, aes(x=ihc_value)) + 
        geom_bar(stat = "count", position = "stack") + 
        labs(title="Barplot with COUNT of patients in each PR ICH expression value (from 0 to 100%)")+
        theme(legend.position = "none", plot.title=element_text(size=10))
p2
```



#### Compare

```{r, fig.width=7}
p3 <- ggplot(data=dfPR, aes(x=ihc_value, y=expressionValue, group=1)) +
    geom_point(colour="red", size=1, shape=21, fill="white") +
    labs(title="Comparison between RNA-seq and IHC values for PR in Breast cancer") +
    xlab("IHC value") +
    ylab("RNA-seq z-score") +
    geom_smooth(method="lm") +
    geom_hline(yintercept =0.3) +
    theme(legend.position = "none", plot.title=element_text(size=10))

ggplotly(p3,width = 650, height = 400, margin(t=1000))
```


#### Fit to a linear model

```{r}
# Convert chategorical values to continue numerical value 
# <10% = 1
# 10-19% = 2
# etc..
dfPR$ihc_value2 <- as.numeric(factor(dfPR$ihc_value))
# Fit the data into a linea regressino model ache chek the coefficients
summary(lm(expressionValue ~ ihc_value2, dfPR))
```

### Put all the plots together


```{r, fig.height=8, fig.width=12, echo=FALSE}
library(grid)
grid.newpage()
pushViewport(viewport(layout = grid.layout(2, 2)))

print(p1 + coord_flip(), vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p3, vp = viewport(layout.pos.row = 1, layout.pos.col = 2))
print(p2, vp = viewport(layout.pos.row = 2, layout.pos.col = 2))

```